started: Alexey Larionov, 2016
last updated: Alexey Larionov, 27Aug2017

Summary

Calculates AFs and HWE for 480 cases (before calculating eigenvectors)
Removes 684 variants violating HWE (p <10-4 : threshold recommended by EZ)
Some of these could be valid multiallelic variants - not verifyed at this occasion

Eigenvectors are calculated using 44,508 common variants only:
5% < AF < 95% in each of the compared datasets (UBC and CBC)

Requires accessory scripts f01_calculate_eigenvectors.R and f03_qqunif_plot.R

Suggests two eigenvectors’ outliers (> 6 SdtDev on 2nd EV): P5_E09 and P6_D05
Additionally there are two outliers along the 4th EV: P2_C08 and P4_F10

Input data: 239,642 vars x 480 cases (245 UBC and 235 CBC)
Output data: 238,958 vars x 480 cases (245 UBC and 235 CBC)

start_section

# Time stamp
Sys.time()
## [1] "2017-08-27 19:10:19 BST"
# Clenan-up
rm(list=ls())

# Libraries location
.libPaths("/home/alarionov/R/my_libs_r3.4.1/")

# Base folder
library(knitr)
base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
opts_knit$set(root.dir = base_folder)
#setwd(base_folder)

# Required libraries
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(HardyWeinberg)
## Loading required package: mice
# Accessory functions
source(paste(base_folder, "scripts", "f03_qqunif_plot.R", sep="/"))
source(paste(base_folder, "scripts", "f01_calculate_eigenvectors.R", sep="/"))

# Filtering settings
hwe_th <- 0.0001 

load_data

#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
load(paste(base_folder, "results", "r04_filter_cases_and_variants_wecare_only.RData", sep="/"))

check_data

ls()
## [1] "base_folder"                             
## [2] "exac.df"                                 
## [3] "genotypes.mx"                            
## [4] "hwe_th"                                  
## [5] "kgen.df"                                 
## [6] "normalise_and_calculate_eigenvectors.udf"
## [7] "phenotypes.df"                           
## [8] "qqunif.plot"                             
## [9] "variants.df"
dim(genotypes.mx)
## [1] 239642    480
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000001      0      1      0      0      0
## Var000000002      0      0      0      0      0
## Var000000008      0      0     NA      0     NA
## Var000000020      0      0      0      0      0
## Var000000021      0      0      0      0      0
dim(phenotypes.df)
## [1] 480  31
str(phenotypes.df)
## 'data.frame':    480 obs. of  31 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ registry      : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ eig4_gwas     : num  -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
##  $ eig5_gwas     : num  -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
dim(variants.df)
## [1] 239642     23
colnames(variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "AC"                 "AF"                
## [10] "AN"                 "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"
variants.df[1:5,1:5]
##                SplitVarID        SYMBOL  TYPE CHROM    POS
## Var000000001 Var000000001 RP11-206L10.1 INDEL     1 664486
## Var000000002 Var000000002     LINC00115   SNP     1 762330
## Var000000008 Var000000008        SAMD11   SNP     1 871215
## Var000000020 Var000000020         NOC2L   SNP     1 880461
## Var000000021 Var000000021         NOC2L   SNP     1 880483
dim(kgen.df)
## [1] 239642      9
colnames(kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN    kgen.AF kgen.AFR_AF
## Var000000001 Var000000001      NA      NA         NA          NA
## Var000000002 Var000000002      NA      NA         NA          NA
## Var000000008 Var000000008      NA      NA         NA          NA
## Var000000020 Var000000020      NA      NA         NA          NA
## Var000000021 Var000000021      11    5008 0.00219649       8e-04
dim(exac.df)
## [1] 239642     48
colnames(exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000001 Var000000001               NA               NA
## Var000000002 Var000000002        1.200e-02              175
## Var000000008 Var000000008               NA               NA
## Var000000020 Var000000020        2.825e-05                3
## Var000000021 Var000000021        1.422e-03              151
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000001               NA                      NA
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000020           106200                       0
## Var000000021           106186                      60
# Check consistency of colnames and rownames
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0

recalculate_AFs

Used for selecting common variants for eigenvectors computation.
Will be recalculated later after exclusion of eigenvectors outliers.

# Rename AF fields in the variants table
vars_colnames <- colnames(variants.df)
"ac_raw" -> vars_colnames[ vars_colnames == "AC" ]
"an_raw" -> vars_colnames[ vars_colnames == "AN" ]
"af_raw" -> vars_colnames[ vars_colnames == "AF" ]
vars_colnames -> colnames(variants.df)

# Function to calculate AN
get_allele_number.udf <- function(x){2*sum(!is.na(x))}

# --- Calculate total AFs --- #

# Calculate total ac, an and af
ac_all <- apply(genotypes.mx, 1, sum, na.rm=TRUE)
an_all <- apply(genotypes.mx, 1, get_allele_number.udf)
af_all <- ac_all/an_all

# Add new AFs to the variants table
variants.df <- cbind(variants.df, ac_all, an_all, af_all)

# --- Calculate ubc AFs --- #

# Prepare genotypes table
ubc_cases <- phenotypes.df$cc == 0 
sum(ubc_cases) # 245
## [1] 245
ubc_genotypes.mx <- genotypes.mx[,ubc_cases]
dim(ubc_genotypes.mx)
## [1] 239642    245
# Calculate ubc ac, an and af
ac_ubc <- apply(ubc_genotypes.mx, 1, sum, na.rm=TRUE)
an_ubc <- apply(ubc_genotypes.mx, 1, get_allele_number.udf)
af_ubc <- ac_ubc/an_ubc

# Add updated AFs to variants.df
variants.df <- cbind(variants.df, ac_ubc, an_ubc, af_ubc)

# --- Calculate_cbc_AFs --- #

# Prepare genotypes table
cbc_cases <- phenotypes.df$cc == 1 
sum(cbc_cases) # 235
## [1] 235
cbc_genotypes.mx <- genotypes.mx[,cbc_cases]
dim(cbc_genotypes.mx)
## [1] 239642    235
# Calculate cbc ac, an and af
ac_cbc <- apply(cbc_genotypes.mx, 1, sum, na.rm=TRUE)
an_cbc <- apply(cbc_genotypes.mx, 1, get_allele_number.udf)
af_cbc <- ac_cbc/an_cbc

# Add updated AFs to variants.df
variants.df <- cbind(variants.df, ac_cbc, an_cbc, af_cbc)

# Clean-up
rm(vars_colnames, get_allele_number.udf, ac_all, an_all, af_all, 
   cbc_cases, cbc_genotypes.mx, ac_cbc, an_cbc, af_cbc,
   ubc_cases, ubc_genotypes.mx, ac_ubc, an_ubc, af_ubc)

filter_by_hwe

Using library HardyWeinberg

# Prepare genotypes counts
genotypes_counts <- MakeCounts(t(genotypes.mx),coding=c(0,1,2))
dim(genotypes_counts)
## [1] 239642      4
genotypes_counts[1:10,]
##        AA  AB  BB NA
##  [1,] 427  44   0  9
##  [2,] 471   4   0  5
##  [3,] 414   3   0 63
##  [4,] 477   1   0  2
##  [5,] 478   1   0  1
##  [6,] 478   1   0  1
##  [7,]  55 211 185 29
##  [8,] 463   1   0 16
##  [9,] 457   1   0 22
## [10,] 471   2   0  7
# Calculate HW p-values
hwe <- HWExactStats(genotypes_counts[,1:3], verbose=FALSE)
hwe[1:10]
##  [1] 0.6145350 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.7578755
##  [8] 1.0000000 1.0000000 1.0000000
variants.df <- cbind(variants.df, hwe)

# Select common variants (for QQ plot only)
common_variants <- variants.df$af_all > 0.05 & variants.df$af_all < 0.95
sum(common_variants) # 46,795
## [1] 46795
# Bonferroni-style threshold - too relaxed (EZ)
1/length(hwe) # ~4e-6
## [1] 4.172891e-06
hwe_violators <- hwe < 1/length(hwe) 
sum(hwe_violators) # 605
## [1] 605
# A stronger conventional threshold (10-4, EZ recommended between 5e-4 5e-5)
hwe_violators <- hwe < hwe_th
sum(hwe_violators) # 684
## [1] 684
# QQ-plots for HWE
qqunif.plot(hwe, 
            main="QQ plot for all HWE p-values")
## Loading required package: grid

qqunif.plot(hwe[!hwe_violators], 
            main=paste("QQ plot for HWE p-values",
                "\n excluding HWE violaters (p<10-4)"))

qqunif.plot(hwe[!hwe_violators & common_variants], 
            main=paste("QQ plot for HWE p-values",
                "\n excluding HWE violaters (p<10-4) and rare variants (MAF<5%)"))

# Remove variants violating HWE 
variants.df <- variants.df[!hwe_violators,]
genotypes.mx <- genotypes.mx[!hwe_violators,]
kgen.df <- kgen.df[!hwe_violators,]
exac.df <- exac.df[!hwe_violators,]

# Check results
dim(variants.df)
## [1] 238958     33
# Clean-up
rm(genotypes_counts, hwe, hwe_violators, common_variants, qqunif.plot, hwe_th)

calculate_eigenvectors

Requires source(“f01_calculate_eigenvectors.R”)

Only common variants (0.05 < AF < 0.95 in both CBC and UBC) will be used for eigenvectors calculation.

Note exclusion on both sides: low- and high- AFs:
- Low AFs remove rare variants with common allele in reference genome
- Hight AFs remove rare variants with common allele in reference genome

# --- Make subset of data with common variants only

cbc_common_vars <- variants.df$af_cbc > 0.05 & variants.df$af_cbc < 0.95
sum(cbc_common_vars) # 46,135
## [1] 46135
ubc_common_vars <- variants.df$af_ubc > 0.05 & variants.df$af_ubc < 0.95
sum(ubc_common_vars) # 46,076
## [1] 46076
common_overlap_vars <- cbc_common_vars & ubc_common_vars
sum(common_overlap_vars) # 44,508
## [1] 44508
min(variants.df$af_all[common_overlap_vars]) # ~0.05
## [1] 0.0503212
max(variants.df$af_all[common_overlap_vars]) # ~0.95
## [1] 0.9487179
common_overlap_genotypes.mx <- genotypes.mx[common_overlap_vars,]
dim(common_overlap_genotypes.mx)
## [1] 44508   480
common_overlap_genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000024      2      1     NA      1      2
## Var000000054      2      2     NA      2     NA
## Var000000058      2      2      2      2      2
## Var000000062      0      0      0      0      0
## Var000000109      0      1      0      1      0
# --- Calculate eigenvectors --- #

wecare.eigen <- normalise_and_calculate_eigenvectors.udf(common_overlap_genotypes.mx)

# Clean-up
rm(cbc_common_vars, ubc_common_vars, common_overlap_vars, 
   common_overlap_genotypes.mx, normalise_and_calculate_eigenvectors.udf)

plot_eigenvectors

Note manually coded varaintsxsamples values in figure titles:
need to be corrected manually, if changed

# --- Prepare data for plotting --- #

wecare.eigenvectors.df <- as.data.frame(wecare.eigen$vectors) # eigenvectors in columns

# Prepare colour scale
colours <- c("UBC" = "BLUE", "CBC" = "RED")
userColourScale <- scale_colour_manual(values=colours)

# Prepare cases lables
cases_labels <- as.vector(phenotypes.df$cc)
"CBC" -> cases_labels[cases_labels==1]
"UBC" -> cases_labels[cases_labels==0]

summary(as.factor(cases_labels))
## CBC UBC 
## 235 245
# Prepare cases IDs (for labels on onteractive plot)
cases_IDs <- as.vector(phenotypes.df$wes_id)

# make the dataframe
data2plot.df <- cbind(cases_IDs, cases_labels, wecare.eigenvectors.df[,1:5])
colnames(data2plot.df) <- c("wes_id", "group", "ev1", "ev2", "ev3", "ev4", "ev5")

# --- Plot eig1 vs eig2 --- #

g <- ggplot(data2plot.df, aes(ev1, ev2)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector1", y = "eigenvector2") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig2 vs eig3 --- #

g <- ggplot(data2plot.df, aes(ev2, ev3)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector2", y = "eigenvector3") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig3 vs eig4 --- #

g <- ggplot(data2plot.df, aes(ev3, ev4)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector3", y = "eigenvector4") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig4 vs eig5 --- #

g <- ggplot(data2plot.df, aes(ev4, ev5)) +
  geom_point(aes(colour=group, fill=group, text = wes_id)) + 
  labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector4", y = "eigenvector5") +
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Clean-up --- #

rm(wecare.eigenvectors.df, g, data2plot.df, cases_IDs, cases_labels, colours, userColourScale)

calculate_outliers

Explore 6 standard deviations in 5 top eigenvectors

wecare.eigenvectors.mx <- wecare.eigen$vectors # eigenvectors in columns

# No outliers on 1st ev
ev1 <- wecare.eigenvectors.mx[,1]
ev1.positive_outliers <- ev1 > mean(ev1) + 6 * sd(ev1)
ev1.negative_outliers <- ev1 < mean(ev1) - 6 * sd(ev1)
sum(ev1.positive_outliers)
## [1] 0
sum(ev1.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev1.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev1.negative_outliers]
## character(0)
# 2 outliers on 2nd ev: P5_E09 and P6_D05
ev2 <- wecare.eigenvectors.mx[,2]
ev2.positive_outliers <- ev2 > mean(ev2) + 6 * sd(ev2)
ev2.negative_outliers <- ev2 < mean(ev2) - 6 * sd(ev2)
sum(ev2.positive_outliers)
## [1] 0
sum(ev2.negative_outliers)
## [1] 2
phenotypes.df$wes_id[ev2.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev2.negative_outliers]
## [1] "P5_E09" "P6_D05"
# No outliers on 3rd ev
ev3 <- wecare.eigenvectors.mx[,3]
ev3.positive_outliers <- ev3 > mean(ev3) + 6 * sd(ev3)
ev3.negative_outliers <- ev3 < mean(ev3) - 6 * sd(ev3)
sum(ev3.positive_outliers)
## [1] 0
sum(ev3.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev3.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev3.negative_outliers]
## character(0)
# 2 outliers on 4th ev: P2_C08 and P4_F10
ev4 <- wecare.eigenvectors.mx[,4]
ev4.positive_outliers <- ev4 > mean(ev4) + 6 * sd(ev4)
ev4.negative_outliers <- ev4 < mean(ev4) - 6 * sd(ev4)
sum(ev4.positive_outliers)
## [1] 2
sum(ev4.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev4.positive_outliers]
## [1] "P2_C08" "P4_F10"
phenotypes.df$wes_id[ev4.negative_outliers]
## character(0)
# No outliers on 5th ev
ev5 <- wecare.eigenvectors.mx[,5]
ev5.positive_outliers <- ev5 > mean(ev5) + 6 * sd(ev5)
ev5.negative_outliers <- ev5 < mean(ev5) - 6 * sd(ev5)
sum(ev5.positive_outliers)
## [1] 0
sum(ev5.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev5.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev5.negative_outliers]
## character(0)
# Plot eigenvalues
plot(wecare.eigen$values, main="Eigenvalues, wecare only")

wecare.eigen$values[1:10]
##  [1] 5.090430 4.072666 3.682725 3.174499 3.096095 3.086285 2.999579
##  [8] 2.981127 2.946367 2.919219
# Clean-up
rm(wecare.eigenvectors.mx, 
   ev1, ev1.positive_outliers, ev1.negative_outliers, 
   ev2, ev2.positive_outliers, ev2.negative_outliers, 
   ev3, ev3.positive_outliers, ev3.negative_outliers,
   ev4, ev4.positive_outliers, ev4.negative_outliers,
   ev5, ev5.positive_outliers, ev5.negative_outliers)

data_summary

ls()
## [1] "base_folder"   "exac.df"       "genotypes.mx"  "kgen.df"      
## [5] "phenotypes.df" "variants.df"   "wecare.eigen"
dim(genotypes.mx)
## [1] 238958    480
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000001      0      1      0      0      0
## Var000000002      0      0      0      0      0
## Var000000008      0      0     NA      0     NA
## Var000000020      0      0      0      0      0
## Var000000021      0      0      0      0      0
dim(phenotypes.df)
## [1] 480  31
str(phenotypes.df)
## 'data.frame':    480 obs. of  31 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ registry      : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ eig4_gwas     : num  -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
##  $ eig5_gwas     : num  -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
dim(variants.df)
## [1] 238958     33
colnames(variants.df)
##  [1] "SplitVarID"         "SYMBOL"             "TYPE"              
##  [4] "CHROM"              "POS"                "REF"               
##  [7] "ALT"                "ac_raw"             "af_raw"            
## [10] "an_raw"             "Consequence"        "SIFT_call"         
## [13] "SIFT_score"         "PolyPhen_call"      "PolyPhen_score"    
## [16] "CLIN_SIG"           "cDNA_position"      "CDS_position"      
## [19] "Codons"             "Protein_position"   "Amino_acids"       
## [22] "Existing_variation" "Multiallelic"       "ac_all"            
## [25] "an_all"             "af_all"             "ac_ubc"            
## [28] "an_ubc"             "af_ubc"             "ac_cbc"            
## [31] "an_cbc"             "af_cbc"             "hwe"
variants.df[1:5,1:5]
##                SplitVarID        SYMBOL  TYPE CHROM    POS
## Var000000001 Var000000001 RP11-206L10.1 INDEL     1 664486
## Var000000002 Var000000002     LINC00115   SNP     1 762330
## Var000000008 Var000000008        SAMD11   SNP     1 871215
## Var000000020 Var000000020         NOC2L   SNP     1 880461
## Var000000021 Var000000021         NOC2L   SNP     1 880483
dim(kgen.df)
## [1] 238958      9
colnames(kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN    kgen.AF kgen.AFR_AF
## Var000000001 Var000000001      NA      NA         NA          NA
## Var000000002 Var000000002      NA      NA         NA          NA
## Var000000008 Var000000008      NA      NA         NA          NA
## Var000000020 Var000000020      NA      NA         NA          NA
## Var000000021 Var000000021      11    5008 0.00219649       8e-04
dim(exac.df)
## [1] 238958     48
colnames(exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000001 Var000000001               NA               NA
## Var000000002 Var000000002        1.200e-02              175
## Var000000008 Var000000008               NA               NA
## Var000000020 Var000000020        2.825e-05                3
## Var000000021 Var000000021        1.422e-03              151
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000001               NA                      NA
## Var000000002            14012                      59
## Var000000008               NA                      NA
## Var000000020           106200                       0
## Var000000021           106186                      60
str(wecare.eigen)
## List of 2
##  $ values : num [1:480] 5.09 4.07 3.68 3.17 3.1 ...
##  $ vectors: num [1:480, 1:480] -0.15651 -0.14148 -0.00298 -0.00799 0.0193 ...
##  - attr(*, "class")= chr "eigen"
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0

save_data

#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
save.image(paste(base_folder, "results", "r05_calculate_hw_and_egenvectors_wecare_only.RData", sep="/"))

final_section

ls()
## [1] "base_folder"   "exac.df"       "genotypes.mx"  "kgen.df"      
## [5] "phenotypes.df" "variants.df"   "wecare.eigen"
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] lattice_0.20-35     HardyWeinberg_1.5.8 mice_2.30          
## [4] plotly_4.7.1        ggplot2_2.2.1       knitr_1.16         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.11      compiler_3.4.1    plyr_1.8.4       
##  [4] bindr_0.1         tools_3.4.1       rpart_4.1-11     
##  [7] digest_0.6.12     jsonlite_1.5      evaluate_0.10.1  
## [10] tibble_1.3.3      gtable_0.2.0      viridisLite_0.2.0
## [13] pkgconfig_2.0.1   rlang_0.1.1       Matrix_1.2-10    
## [16] shiny_1.0.5       crosstalk_1.0.0   yaml_2.1.14      
## [19] bindrcpp_0.2      dplyr_0.7.1       stringr_1.2.0    
## [22] httr_1.3.1        htmlwidgets_0.9   nnet_7.3-12      
## [25] rprojroot_1.2     glue_1.1.1        data.table_1.10.4
## [28] R6_2.2.2          survival_2.41-3   rmarkdown_1.6    
## [31] purrr_0.2.3       tidyr_0.7.0       magrittr_1.5     
## [34] splines_3.4.1     backports_1.1.0   scales_0.4.1     
## [37] htmltools_0.3.6   MASS_7.3-47       assertthat_0.2.0 
## [40] xtable_1.8-2      mime_0.5          colorspace_1.3-2 
## [43] httpuv_1.3.5      labeling_0.3      stringi_1.1.5    
## [46] lazyeval_0.2.0    munsell_0.4.3
Sys.time()
## [1] "2017-08-27 19:11:50 BST"